import os
import glob
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import xgboost as xgb
from scipy.stats import beta
from sklearn import preprocessing
pd.set_option('display.max_columns', None) # display all columns
pd.set_option('display.max_colwidth', None) # no column-width truncation
daywise_search = pd.read_csv('goa/daywise_search.csv', parse_dates=['searchDay'])
daywise_search.columns = ['searchDay','tid_count']
daywise_search = daywise_search.sort_values('searchDay')
fig = px.line(daywise_search, x='searchDay', y='tid_count', title='#unique tids each day')
fig.show()
startDate = pd.to_datetime('2019-12-01 00:00:00+05:30')
endDate = pd.to_datetime('2020-02-29 00:00:00+05:30')
print(startDate, endDate)
def dropNullColumns(df):
return df.dropna(axis=1, how='all')
def dropColumnsWithSameValue(df):
nunique = df.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
return df.drop(columns=cols_to_drop)
def memory_usage(df):
return(round(df.memory_usage(deep=True).sum() / 1024 ** 2, 2))
Columns dropped from the UGC data, with reasons:
questioncount: # of questions asked on the platform
answercount: unverified data (140 answers for 1 question)
hotelreplycount: # of hotel replies on user reviews (unusable without review count)
approvedimagecount: cannot measure quality with this metric; a pic can be of a torn bedsheet and still be approved
ugc_data = pd.read_csv('goa/ugc_data.csv', dtype={'localityid':'str','voyagerid':'str'})
location_data = ugc_data[['voyagerid','localityid','localityname']]
ugc_data.drop(columns=['localityid','localityname'], inplace=True)
ugc_data = dropNullColumns(ugc_data)
ugc_data = dropColumnsWithSameValue(ugc_data)
ugc_data
ugc_data_columns = ugc_data.count() / ugc_data.shape[0]
ugc_data_columns = ugc_data_columns[ugc_data_columns > 0.4].index.to_list()
ugc_data = ugc_data[ugc_data_columns]
ugc_data
recentwindowrating_median = ugc_data['recentwindowrating'].replace(0., np.NaN).median()
relativerating_median = ugc_data['relativerating'].replace(0., np.NaN).median()
recommendation_val_median = ugc_data['recommendation_val'].replace(0., np.NaN).median()
ugc_data.loc[(ugc_data['recentwindowrating'] == 0.) | ugc_data['recentwindowrating'].isna(), 'recentwindowrating'] = recentwindowrating_median
ugc_data.loc[(ugc_data['relativerating'] == 0.) | ugc_data['relativerating'].isna(), 'relativerating'] = relativerating_median
ugc_data.loc[(ugc_data['recommendation_val'] == 0.) | ugc_data['recommendation_val'].isna(), 'recommendation_val'] = recommendation_val_median
ugc_data.loc[(ugc_data['reviewcount'] <= 0.) | ugc_data['reviewcount'].isna(), 'reviewcount'] = 1.
ugc_data.plot(x='reviewcount', y='detailedrating', kind='hexbin', sharex=False,
xscale='log', cmap='YlGnBu', gridsize=8, mincnt=1,
title="detailed rating by review count")
ugc_data.loc[ugc_data['detailedrating'].isna(), 'detailedrating'] = 0.
ugc_data['bayesian_rating'] = ((50*3.5) + (ugc_data['reviewcount']*ugc_data['detailedrating']))/(50+ugc_data['reviewcount'])
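bayesian_rating is a Bayesian average: a prior of 50 pseudo-reviews at 3.5 stars is blended with the observed reviews, so sparsely reviewed hotels shrink toward 3.5 while heavily reviewed ones keep their own mean. A minimal sanity check of the shrinkage, on hypothetical counts:
def bayesian_rating(review_count, rating, prior_count=50, prior_mean=3.5):
    # weighted blend of the prior mean and the observed mean rating
    return (prior_count*prior_mean + review_count*rating) / (prior_count + review_count)
print(bayesian_rating(2, 5.0))    # ~3.56: two 5-star reviews barely move the prior
print(bayesian_rating(500, 4.6))  # 4.5: enough reviews dominate the prior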
ugc_data
for starRating in ugc_data['starrating'].unique():
if pd.notna(starRating):
ugc_data['{}star'.format(int(starRating))] = ugc_data['starrating'] == starRating
ugc_data.drop(columns=['amenitiesrating','cleanlinessrating','fdrating','locrating','sqrating','vfmrating',
'questioncount','answercount','hotelreplycount','approvedimagecount','reviewcount','starrating',
'filteredreviewcounts_negative','filteredreviewcounts_positive','hotelrating','detailedrating'],
inplace=True)
ugc_data.set_index('voyagerid', inplace=True)
ugc_data
Data is collected in the Augur DB from the Voyager API.
voyager_meta_data = pd.read_csv('goa/htl_meta_data.csv',
dtype={'hotel_voyager_id':'object', 'city_id':'object'},
parse_dates=['created_date','updated_time'])
hotelTypes = set(voyager_meta_data.hotel_list_type[~voyager_meta_data.hotel_list_type.isnull()].str.cat(sep=',').split(','))
for hotelType in hotelTypes:
    voyager_meta_data[hotelType] = voyager_meta_data['hotel_list_type'].str.contains(hotelType, regex=False)
for propertyType in voyager_meta_data.property_type.unique():
if (pd.notna(propertyType)):
voyager_meta_data[propertyType.lower()] = voyager_meta_data['property_type'] == propertyType
voyager_meta_data = dropNullColumns(voyager_meta_data)
location_data = voyager_meta_data[['hotel_voyager_id','hotel_name','longitude','latitude']].merge(location_data, how='outer',
left_on='hotel_voyager_id',
right_on='voyagerid')
# add oyo
voyager_meta_data['oyo'] = voyager_meta_data['chain_code'].str.startswith('oyo').fillna(False)
# ignoring chains since there are 74 chains
voyager_meta_data.drop(columns=['city_name','hotel_name','chain_code','is_chain','longitude','latitude','is_ingo',
'updated_time','is_active','property_type','focus_mapping','is_fraud',
'created_date','hotel_list_type','col_segment'], inplace=True)
voyager_meta_data.fillna(False, inplace=True)
voyager_meta_data = dropColumnsWithSameValue(voyager_meta_data)
voyager_meta_data.set_index('hotel_voyager_id', inplace=True)
voyager_meta_data
Extracted from the Voyager data (Augur DB) and the UGC query (Redshift).
location_data['vhid'] = location_data.hotel_voyager_id.combine_first(location_data.voyagerid)
location_data = location_data[['vhid','hotel_name','localityname','localityid','latitude','longitude']]
location_data
# locality features - which are popular
px.set_mapbox_access_token(open(".mapbox_token").read())
top_localities = location_data.groupby('localityid').agg({'vhid':'count'}).sort_values('vhid', ascending=False).head(20).index.to_list()
fig = px.scatter_mapbox(location_data[location_data['localityid'].isin(top_localities)],
                        lat="latitude", lon="longitude", color="localityname", text='hotel_name')
fig.show()
nps_data = pd.read_csv('goa/nps_data.csv', dtype={'voyagerid':'object'})
nps_data = nps_data.groupby('voyagerid').agg({'promoters':'sum','nutral':'sum','detractors':'sum'}).reset_index()
nps_data['nps_count'] = (nps_data['promoters'] + nps_data['nutral'] + nps_data['detractors'])
nps_data['nps_score'] = (nps_data['promoters'] - nps_data['detractors']) / nps_data['nps_count']
nps_data['non_detractor'] = (nps_data['promoters'] + nps_data['nutral']) / nps_data['nps_count']
nps_data.sort_values('nps_count', ascending=False)
# nps data not enough
nps_data.plot(x='nps_count', y='nps_score', kind='hexbin', sharex=False,
cmap='YlGnBu', gridsize=6, mincnt=1,
title="nps score by count")
nps_data.plot(x='nps_count', y='non_detractor', kind='hexbin', sharex=False,
              cmap='YlGnBu', gridsize=6, mincnt=1,
              title="non-detractor share by count")
nps_data['bayesian_nps'] = ((10*0) + (nps_data['promoters'] - nps_data['detractors']))/(10+nps_data['nps_count'])
nps_data['bayesian_non_detractor'] = ((10*0.5) + (nps_data['promoters'] + nps_data['nutral']))/(10+nps_data['nps_count'])
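Both smoothed columns use the same Bayesian-average trick as the rating: 10 pseudo-responses at a neutral prior (NPS 0, non-detractor share 0.5) are added to the observed counts, so hotels with few responses shrink toward neutral. A quick check on hypothetical counts:
promoters, nutral, detractors = 4, 1, 0
n = promoters + nutral + detractors
print((10*0 + (promoters - detractors)) / (10 + n))  # ~0.27, vs a raw NPS of 0.8
print((10*0.5 + (promoters + nutral)) / (10 + n))    # ~0.67, vs a raw share of 1.0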
nps_data = nps_data[['voyagerid','nps_count','nps_score','bayesian_nps','non_detractor','bayesian_non_detractor']]
nps_data = nps_data[['voyagerid','bayesian_nps']].set_index('voyagerid') # keep only the smoothed NPS for modelling
nps_data
static_data = ugc_data.merge(voyager_meta_data, left_index=True, right_index=True, how='outer')
static_data = static_data.merge(nps_data, left_index=True, right_index=True, how='outer')
static_data['recommendation_val'] = static_data['recommendation_val'].fillna(recommendation_val_median)
static_data['recentwindowrating'] = static_data['recentwindowrating'].fillna(recentwindowrating_median)
static_data['relativerating'] = static_data['relativerating'].fillna(relativerating_median)
static_data['bayesian_rating'] = static_data['bayesian_rating'].fillna(3.5)
static_data['bayesian_nps'] = static_data['bayesian_nps'].fillna(0.)
static_data.fillna(False, inplace=True)
scaler = preprocessing.MinMaxScaler()
static_data = pd.DataFrame(scaler.fit_transform(static_data), index=static_data.index, columns=static_data.columns)
static_data = static_data.reset_index()
static_data
Check the contact-detail feature for other cities; in this city, all hotels have their contact details updated.
ingo_content_data = pd.read_csv('goa/ingo_content_score.csv',
dtype={'hotelcode':'object','voyagerid':'object','cityid':'object'})
ingo_content_data['modifiedon'] = pd.to_datetime(ingo_content_data['modifiedon'], unit='ms')
ingo_content_data['modified_date'] = ingo_content_data['modifiedon'].dt.date
ingo_content_data = dropNullColumns(ingo_content_data)
ingo_content_data = dropColumnsWithSameValue(ingo_content_data)
ingo_content_data.fillna(0., inplace=True)
ingo_content_data.drop(columns=['hotelcode','hoteltype','totalcontentscore'], inplace=True)
ingo_content_data
ingo_content_data_deduped = ingo_content_data.groupby(['voyagerid','modified_date']).agg({'modifiedon':'max'})[2:].reset_index().drop(columns=['modified_date'])
ingo_content_data_deduped = ingo_content_data.merge(ingo_content_data_deduped, on=['voyagerid','modifiedon']).drop(columns=['modifiedon'])
ingo_content_data_deduped
ingo_content_data_init = ingo_content_data_deduped[ingo_content_data_deduped['modified_date']<=pd.to_datetime('2020-01-01')].groupby('voyagerid').agg({'modified_date':'max'}).reset_index()
ingo_content_data_init = ingo_content_data_deduped.merge(ingo_content_data_init, on=['voyagerid','modified_date'])
ingo_content_data_init['modified_date'] = pd.to_datetime('2020-01-01').date()
ingo_content_data_init
ingo_content_data_deduped = ingo_content_data_deduped[ingo_content_data_deduped['modified_date']>pd.to_datetime('2020-01-01')]
ingo_content_data = pd.concat([ingo_content_data_init, ingo_content_data_deduped], axis=0, ignore_index=True)
ingo_content_data['modified_date'] = pd.to_datetime(ingo_content_data['modified_date'])
ingo_content_data.rename(columns={'voyagerid':'vhid','modified_date':'dt'}, inplace=True)
ingo_content_data
transactions = pd.read_csv('goa/transactions.csv', parse_dates=['bookingDate'], dtype={'vhid':'object'})
transactions.rename(columns={'transactions':'rn','bookingDate':'dt'}, inplace=True)
transactions
mean_asp_by_vhid = transactions.groupby('vhid').agg({'asp':'mean'})
mean_asp_by_date = transactions.groupby('dt').agg({'asp':'mean'})
Bayesian inference to calculate CTR/CR
x = detail_hits in case of CTR | bookings in case of CR
N = impressions in case of CTR | detail_hits in case of CR
x_mean = mean value of x across all hotels in the city for the day
N_mean = mean value of N across all hotels in the city for the day
alpha_coefficient = x_mean
beta_coefficient = N_mean - x_mean
Borrowed from here
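Concretely, the prior is Beta(x_mean, N_mean - x_mean), so the posterior mean used as the smoothed rate is (x + x_mean) / (N + N_mean). A minimal sketch with scipy.stats.beta on hypothetical numbers:
from scipy.stats import beta
x, N = 3, 40             # one hotel's detail hits and impressions for a day (hypothetical)
x_mean, N_mean = 5, 120  # city-wide daily means (hypothetical)
# posterior is Beta(x + alpha, (N - x) + beta); its mean is (x + x_mean) / (N + N_mean)
print(beta.mean(x + x_mean, (N - x) + (N_mean - x_mean)))  # 0.05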
# daily_funnel_data = pd.read_csv('goa/daily_funnel_data.csv', parse_dates=['searchDate'], dtype={'searchVhid':'object'})
# daily_funnel_data.rename(columns={'searchDate':'dt','searchVhid':'vhid'}, inplace=True)
# daily_funnel_data = daily_funnel_data.replace(0., np.NaN)
# funnel_data_grouped = daily_funnel_data.groupby('dt').agg({'searches':'mean','detailClicks':'mean','bookings':'mean'})
# daily_funnel_data = daily_funnel_data.merge(funnel_data_grouped, how='left', left_on='dt', right_index=True).fillna(0.)
# # cr and ctr are calculated using bayesian inference
# daily_funnel_data['ctr'] = daily_funnel_data.apply(lambda d: beta.mean(d.detailClicks_y + d.detailClicks_x, d.searches_y - d.detailClicks_y + d.searches_x - d.detailClicks_x), axis=1)
# daily_funnel_data['cr'] = daily_funnel_data.apply(lambda d: beta.mean(d.bookings_y + d.bookings_x, d.detailClicks_y - d.bookings_y + d.detailClicks_x - d.bookings_x), axis=1)
# # daily_funnel_data = daily_funnel_data[['date','vhid','ctr','cr']]
# daily_funnel_data.rename(columns={'searches_x':'searches','detailClicks_x':'detailClicks','bookings_x':'bookings',
# 'searches_y':'searches_mean','detailClicks_y':'detailClicks_mean','bookings_y':'bookings_mean'}, inplace=True)
# daily_funnel_data.to_csv('goa/daily_funnel_data_beta.csv', index=False)
# daily_funnel_data
daily_funnel_data = pd.read_csv('goa/daily_funnel_data_beta.csv', parse_dates=['dt'], dtype={'vhid':'object'})
daily_funnel_data
quantiles = daily_funnel_data.groupby('dt').agg({'cr':[lambda x: np.quantile(x, 0.25), lambda x: np.quantile(x, 0.75)],
'ctr':[lambda x: np.quantile(x, 0.25), lambda x: np.quantile(x, 0.75)]})
quantiles.columns = ['cr_q1', 'cr_q3', 'ctr_q1', 'ctr_q3'] # first and third quartiles
quantiles['cr_iqr'] = quantiles['cr_q3'] - quantiles['cr_q1']
quantiles['ctr_iqr'] = quantiles['ctr_q3'] - quantiles['ctr_q1']
quantiles['cr_l'] = quantiles['cr_q1'] - 1.5*quantiles['cr_iqr']
quantiles['cr_u'] = quantiles['cr_q3'] + 1.5*quantiles['cr_iqr']
quantiles['ctr_l'] = quantiles['ctr_q1'] - 1.5*quantiles['ctr_iqr']
quantiles['ctr_u'] = quantiles['ctr_q3'] + 1.5*quantiles['ctr_iqr']
quantiles = quantiles[['cr_l','cr_u','ctr_l','ctr_u']]
quantiles.head()
daily_funnel_data = daily_funnel_data.merge(quantiles, on=['dt'])
daily_funnel_data.loc[daily_funnel_data['ctr'] < daily_funnel_data['ctr_l'], 'ctr'] = daily_funnel_data['ctr_l']
daily_funnel_data.loc[daily_funnel_data['ctr'] > daily_funnel_data['ctr_u'], 'ctr'] = daily_funnel_data['ctr_u']
daily_funnel_data.loc[daily_funnel_data['cr'] < daily_funnel_data['cr_l'], 'cr'] = daily_funnel_data['cr_l']
daily_funnel_data.loc[daily_funnel_data['cr'] > daily_funnel_data['cr_u'], 'cr'] = daily_funnel_data['cr_u']
daily_funnel_data
cr_ctr = daily_funnel_data[daily_funnel_data['dt'] == pd.to_datetime('2020-01-04')]
ctr = (cr_ctr['detailClicks']/cr_ctr['searches']).values
beta_ctr = cr_ctr.ctr.values
cr = (cr_ctr['bookings']/cr_ctr['detailClicks']).fillna(0.).values
beta_cr = cr_ctr.cr.values
fig = ff.create_distplot([ctr, beta_ctr, cr, beta_cr], ['ctr','beta_ctr', 'cr', 'beta_cr'])
fig.show()
daily_funnel_data = daily_funnel_data[['dt','vhid','cr','ctr']]
daily_funnel_data
def weighted_sum(values, historical_weight=0.66, normalize_zero=True):
    # exponential smoothing: blend the running history with each day's value
    result = values.copy()
    for i in range(1, len(result)):
        prev, curr = result.iloc[i-1], result.iloc[i]
        if normalize_zero or curr != 0:
            result.iloc[i] = (historical_weight * prev) + ((1 - historical_weight) * curr)
        else:
            result.iloc[i] = prev  # carry history through days with no signal
    return result
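A quick check of the two zero-handling modes on a hypothetical series:
import pandas as pd
s = pd.Series([10., 0., 20.])
print(weighted_sum(s).tolist())                        # ~[10.0, 6.6, 11.16]: zeros drag the value down
print(weighted_sum(s, normalize_zero=False).tolist())  # [10.0, 10.0, 13.4]: zeros carry history forward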
session_start = pd.to_datetime('2020-01-01')
session_end = pd.to_datetime('2020-02-29')
def historical_weighted(df):
    # reindex one hotel's daily rows to the full session window,
    # then accumulate room nights and smooth the rate/price columns
    df = df.drop(columns='vhid').set_index('dt')
    t_index = pd.date_range(start=session_start, end=session_end, freq='D', closed='left')
    df = df.reindex(t_index).fillna(0.)
    for col in df.columns:
        if col == 'rn':
            df[col] = df[col].cumsum()  # running room-night count
        elif col in ['cr', 'ctr']:
            df[col] = weighted_sum(df[col])  # zero days count as signal
        elif col == 'asp':
            df[col] = weighted_sum(df[col], normalize_zero=False)  # carry last price through gaps
    return df
daily_data = daily_funnel_data.merge(transactions, on=['vhid','dt'], how='outer')
daily_data['rn'] = daily_data['rn'].fillna(0.)
daily_data = daily_data.merge(mean_asp_by_vhid, on=['vhid'], how='left')
daily_data.loc[daily_data['asp_x'].isna(), 'asp_x'] = daily_data['asp_y']
daily_data = daily_data.merge(mean_asp_by_date, on=['dt'], how='left')
daily_data.loc[daily_data['asp_x'].isna(), 'asp_x'] = daily_data['asp']
daily_data = daily_data.drop(columns=['asp_y','asp']).rename(columns={'asp_x':'asp'})
daily_data = daily_data.groupby('vhid').apply(historical_weighted)
daily_data = daily_data.reset_index().rename(columns={'level_1':'dt'})
daily_data = daily_data.merge(ingo_content_data, on='vhid', how='outer')
daily_data = daily_data[daily_data['dt_y'] < daily_data['dt_x']]
daily_data['t'] = (daily_data['dt_x'] - daily_data['dt_y'])
daily_data['rank'] = daily_data.groupby('vhid')['t'].rank(method='min')
min_rank = daily_data.groupby(['vhid','dt_x']).agg({'rank':'min'}).reset_index()
daily_data = daily_data.merge(min_rank, on=['vhid','dt_x'], how='left')
daily_data = daily_data[daily_data['rank_x'] == daily_data['rank_y']]
daily_data = daily_data.drop(columns=['dt_y','t','rank_x','rank_y']).rename(columns={'dt_x':'dt'})
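The rank-and-filter join above attaches, to each (vhid, dt) row, the most recent content snapshot strictly before that day. For reference, pd.merge_asof expresses the same as-of join directly; a sketch of the equivalent (it would have to run on the frames as they were before the merge above, so it is illustrative, not a drop-in):
# for each daily row, take the latest content snapshot with dt strictly before the row's dt
asof = pd.merge_asof(daily_data.sort_values('dt'),
                     ingo_content_data.sort_values('dt'),
                     on='dt', by='vhid',
                     allow_exact_matches=False)  # strict "<", like dt_y < dt_x above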
scaler = preprocessing.MinMaxScaler()
daily_data = daily_data.set_index(['vhid','dt'])
daily_data = pd.DataFrame(scaler.fit_transform(daily_data), index=daily_data.index, columns=daily_data.columns)
daily_data = daily_data.reset_index()
daily_data
merged = static_data.merge(daily_data, how='inner', left_on='index', right_on='vhid').drop(columns=['index'])
merged['vhid'] = merged['vhid'].astype('int64')
merged['dt'] = merged['dt'].dt.date
merged
merged.dtypes
session_data = pd.read_csv('goa/session_data.csv', parse_dates=['searchDate'])
feb_data = session_data[session_data['searchDate']>=pd.to_datetime('2020-02-01')].copy()
feb_data['date'] = feb_data['searchDate'].dt.date - datetime.timedelta(days=1) # join key: features as of the previous day
# feb_data.to_csv('goa/feb_data.csv')
feb_data.shape
feb_data.dtypes
feb_data = feb_data.drop(columns=['Unnamed: 0'])
data = feb_data.merge(merged, left_on=['searchVhid','date'], right_on=['vhid','dt'])
data.drop(columns=['searchVhid','date','dt'], inplace=True)
data.set_index(['vhid','searchDate'], inplace=True)
# data.to_csv('goa/feb_data_merged.csv')
data
import seaborn as sns
nunique = data.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 2].index # binary (boolean) columns add noise to the correlation plot
d = data.loc[:, ~data.columns.isin(cols_to_drop)]
fig, ax = plt.subplots(figsize=(15,8))
sns.heatmap(d.corr(),
            xticklabels=d.columns,
            yticklabels=d.columns)
fig, ax = plt.subplots(figsize=(15,8))
sns.heatmap(data.corr(),
xticklabels=data.columns,
yticklabels=data.columns)
booking_data = pd.read_csv('goa/bookings.csv',
dtype={'vhid':'object'},
parse_dates=['bookingTime','checkin','checkout'])
# booking_data['bookingDate'] = booking_data['bookingTime'].dt.date
booking_data = booking_data[booking_data['cancelled_bookings']<=0] # filter out cancelled bookings
booking_data = dropColumnsWithSameValue(booking_data)
booking_data['asp'] = booking_data['gmv'] / booking_data['confirmedBookings']
booking_data.drop(columns=['gmv'], inplace=True)
print(booking_data.shape)
booking_data.head()
# take ingo transaction for domestic
# path = r'/Users/saman.tamkeen/Desktop/goa/details/'
# all_files = glob.glob(path + "/*.csv")
# # all_files
# li = []
# for i, filename in enumerate(all_files):
# d = pd.read_csv(filename,
# names=['detailTime','detailId','vcid','flavour','checkin','checkout','pax','userId','email','trackingId','vhid'],
# dtype={'vcid':'object','vhid':'object','userId':'object'},
# parse_dates=['detailTime','checkin','checkout'])
# d = d[(d['detailTime'] >= startDate) & (d['detailTime'] <= endDate)]
# if d.shape[0] > 0:
# li.append(d)
# print(i)
# details_data = pd.concat(li, axis=0, ignore_index=True)
# print(details_data.shape)
details_data = pd.read_csv('goa/details.csv',
dtype={'vhid':'object'},
parse_dates=['detailTime','checkin','checkout'])
print(details_data.shape)
details_data.head()
detail2booking = details_data.merge(booking_data, how='left', on=['trackingId','vhid'])
detail2booking.shape
detail2booking = detail2booking[(pd.isna(detail2booking['bookingTime'])) |
((detail2booking['bookingTime'] > detail2booking['detailTime']) &
((detail2booking['bookingTime'] - detail2booking['detailTime']).dt.total_seconds() <= 60*60))]
detail2booking.shape
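The filter keeps details that either never converted or converted within an hour of the detail view; a quick check of the window logic on hypothetical timestamps:
detail = pd.Timestamp('2020-01-04 10:00:00+05:30')
booking = pd.Timestamp('2020-01-04 10:45:00+05:30')
print(booking > detail and (booking - detail).total_seconds() <= 60*60)  # True: attributable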
detail2booking['rank'] = detail2booking.groupby("bookingId")["detailTime"].rank(ascending=False, method='dense')
detail2booking = detail2booking[(pd.isna(detail2booking['bookingTime'])) | (detail2booking['rank'] == 1.)]
detail2booking.drop(columns=['flavour_y','rank'], inplace=True)
detail2booking.rename(columns={'flavour_x':'flavour',
'checkin_x':'checkinDetail',
'checkout_x':'checkoutDetail',
'checkin_y':'checkinBooking',
'checkout_y':'checkoutBooking'}, inplace=True)
detail2booking.shape
detail2booking.drop_duplicates(subset=['detailId'], inplace=True)
detail2booking.shape
before_memory = memory_usage(detail2booking)
# optimize storage
detail2booking['flavour'] = detail2booking['flavour'].astype('category')
# detail2booking['checkinDetail'] = detail2booking['checkinDetail'].dt.date
# detail2booking['checkoutDetail'] = detail2booking['checkoutDetail'].dt.date
detail2booking['vhid'] = detail2booking['vhid'].astype('int64')
# detail2booking['checkinBooking'] = detail2booking['checkinBooking'].dt.date
# detail2booking['checkoutBooking'] = detail2booking['checkoutBooking'].dt.date
detail2booking['rooms'] = detail2booking['rooms'].fillna(0).astype('int8')
detail2booking['adults'] = detail2booking['adults'].fillna(0).astype('int8')
detail2booking['children'] = detail2booking['children'].fillna(0).astype('int8')
detail2booking['status'] = detail2booking['status'].astype('category')
detail2booking['paymode'] = detail2booking['paymode'].fillna(0).astype('int8')
detail2booking['confirmedBookings'] = detail2booking['confirmedBookings'].fillna(0).astype('int8')
detail2booking['asp']= detail2booking['asp'].astype('float32')
after_memory = memory_usage(detail2booking)
print((before_memory - after_memory)/before_memory)
d2b = detail2booking[detail2booking['detailTime']>pd.to_datetime('2020-01-01 00:00:00+05:30')]
d2b.shape
d2b.to_csv('goa/detail2booking.csv', index=False)
detail2booking['asp'].round(2).max()
d2b.head()
d2b = pd.read_csv('goa/detail2booking.csv',
parse_dates=['detailTime','bookingTime','checkinDetail','checkoutDetail','checkinBooking','checkoutBooking'],
dtype={'flavour':'category','vhid':'int64','rooms':'int8','adults':'int8','children':'int8',
'status':'category','paymode':'int8','confirmedBookings':'int8','asp':'float32'})
d2b.shape
d2b.dtypes
# d2b['checkinDetail'] = d2b['checkinDetail'].dt.strftime('%Y%m%d').astype('int32')
# d2b['checkoutDetail'] = d2b['checkoutDetail'].dt.strftime('%Y%m%d').astype('int32')
# d2b['checkinBooking'] = pd.to_numeric(d2b['checkinBooking'].dt.strftime('%Y%m%d'), errors='coerce', downcast='integer').astype('int32')
# d2b['checkoutBooking'] = pd.to_numeric(d2b['checkoutBooking'].dt.strftime('%Y%m%d'), errors='coerce', downcast='integer').fillna(0).astype('int32')
# d2b['checkoutBooking'] = d2b['checkoutBooking'].dt.strftime('%Y%m%d', errors='coerce').astype('int32')
d2b['checkinBooking'] = d2b['checkinBooking'].fillna(0).astype('int32')
d2b.dtypes
d2b['checkinBooking'] = d2b['checkinBooking'].replace(0., np.NaN)
d2b['checkoutBooking'] = d2b['checkoutBooking'].replace(0., np.NaN)
d2b.to_csv('goa/detail2booking.csv', index=False)
pd.read_csv('/Users/saman.tamkeen/Desktop/goa/searches/part-00032-tid-361649420680128099-70bce0fc-603c-4aa9-9c5f-854b2a85f7db-3732-1-c000.csv')
# read in data
dtrain = xgb.DMatrix('agaricus.txt.train')
dtest = xgb.DMatrix('agaricus.txt.test')
# specify parameters via map
param = {'max_depth':2, 'eta':1, 'objective':'binary:logistic' }
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)
preds
# uniqueBookings = detail2booking.merge(uniqueBookings, on=['bookingId','detailTime']).drop_duplicates(subset=['detailId'], keep='first')
# uniqueBookings.shape
d = detail2booking.drop_duplicates(subset='detailId',keep='first')
d.to_csv('goa/details_deduped.csv', index=False)
# d.head()
d.drop(columns=['vcid','userId','email']).to_csv('goa/details_deduped.csv', index=False)
uniqueBookings[uniqueBookings['bookingId']=='HTL24LQ54Q']
detail2booking.groupby('detailId').agg({'detailTime':'count'})
d = details_data.drop_duplicates()
print(d.shape)
d.to_csv('goa/details_deduped.csv', index=False)
booking_data.groupby('bookingId').agg({'bookingTime':'count'})
detail2booking.head()['detailTime'].dt.floor('H')
detail2booking.head()['detailTime']
detail2booking.bookingId.unique()
import json
def json_parser(data):
try:
return json.loads(data)
except Exception as ex:
return None
df = pd.read_csv('f1.csv',
names=['name','dob','stats'],
                 converters={'stats':json_parser})
df
import json
json.loads('{"starRatings":[],"locationIds":[],"locationNames":[],"amenities":[],"propertyTypes":[],"tags":[],"chains":[],"priceRange":[],"offerText":[],"locationSearchForwardParams":[],"gosuggestSearchParamContainer":[{"Vhid":"660156700734546743","searchType":1}],"locationSearchParamContainer":[],"latLongSearchParamContainer":[],"UGCRange":[],"HappySubContextList":[]}')
s = pd.read_csv('/Users/saman.tamkeen/Desktop/searchSample.csv',
names=['vcid','flavour','checkIn','checkOut','pax','userId','email','trackingId','sort','filters',
'filterInput','algo','sessionId','searchTime','hotels'],
delimiter='|', escapechar='\\', error_bad_lines=False,
parse_dates=['searchTime'],
converters={'filters':json_parser,'filterInput':json_parser,'hotels':json_parser},
                dtype={'flavour':'category','checkIn':'int32','checkOut':'int32','sort':'category','algo':'category'})
s.drop(columns=['vcid','userId','email','sessionId'],inplace=True)
s.head()
s = pd.read_csv('/Users/saman.tamkeen/Desktop/goa/session_data/part-00080-tid-5326949392612942954-d4e02ebc-a40f-4a45-b84f-a75a7a3636aa-8058-1-c000.csv',
delimiter='|', escapechar='\\',
parse_dates=['searchTime','detailTime','bookingTime','checkIn','checkOut'],
converters={'filters':json_parser})
s['los'] = (s['checkOut'] - s['checkIn']).dt.days
s['ap'] = (s['checkIn'] - pd.to_datetime(s['searchTime'].dt.date)).dt.days
s['pax_bucket'] = np.select([(s['pax'] == '1-1_0'),(s['pax'] == '1-2_0')], ['solo', 'couple'], default='other')
s['is_relevant'] = s['detailTime'].notna()
s = s[['flavour','logged_in','los','ap','pax_bucket','searchVhid','is_relevant']]
def get_filter_value(r):
filterValue = {}
    for f in r.filters_new:  # filters_new is built by get_filters (defined below)
try:
if f == 'gosuggestSearchParamContainer':
filterValue[f] = [k['Vhid'] for k in r.filterInput[f] if 'Vhid' in k]
elif f == 'locationSearchForwardParams':
filterValue[f] = [k['locationID'] if k['idType'] != 'ggl_id' else k['name'] for k in r.filterInput[f]]
elif f == 'locationSearchParamContainer': # contains google/voyager code
filterValue[f] = [k['locationId'] for k in r.filterInput[f] if 'locationId' in k]
else:
filterValue[f] = r.filterInput[f]
        except Exception:  # fall back to the raw value if the structure is unexpected
            print(f, r.filterInput[f])
            filterValue[f] = r.filterInput[f]
return filterValue
# s = s[['filters','filterInput']]
s['filterValue'] = s.apply(get_filter_value, axis=1)
s.head()
# filterInput:struct
# starRatings:array
# element:float
# locationIds:array
# element:string
# locationNames:array
# element:string
# amenities:array
# element:string
# propertyTypes:array
# element:string
# tags:array
# element:string
# chains:array
# element:string
# hotelPriceBucket:float
# priceRange:array
# element:float
# hotelName:string
# isSlot:float
# payMode:float
# offerText:array
# element:string
# isFreeCancellation:float
# isReserveNowPayLater:float
# isGoBiz:float
# isPah:float
# isCoupleFriendly:float
# isBookedByContacts:float
# isEarlyCheckin:float
# PolygonId:string
# locationSearchForwardParams:array
# element:struct
# locationID:string
# latitude:float
# longitude:float
# radius:float
# idType:string
# name:string
# gosuggestSearchParamContainer:array
# element:struct
# Vhid:string
# searchType:long
# locationSearchParamContainer:array
# element:struct
# locationId:string
# idType:string
# latLongSearchParamContainer:array
# element:struct
# latitude:string
# longitude:string
# UGCRange:array
# element:float
# HappyContext:string
# HappySubContextList:array
# element:string
# HappyContextType:string
s[(s['filters'].str[0] == 'gosuggestSearchParamContainer') & (s['filterValue'].str.contains('Vhid'))]
def get_filters(filterInput):
    if filterInput is None:
        return []
    # keep keys whose value is non-null and, for lists, non-empty
    # (a list containing only empty dicts counts as empty)
    return [k for k, v in filterInput.items()
            if not (v is None or (type(v) == list and (len(v) == 0 or sum([len(e) if type(e) == dict else 1 for e in v]) == 0)))]
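A quick check of get_filters on a hypothetical filterInput; empty lists and lists of empty dicts count as unset:
example = {'starRatings': [], 'amenities': ['wifi'], 'hotelName': None,
           'gosuggestSearchParamContainer': [{}]}
print(get_filters(example))  # ['amenities']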
# s['filters_new'] = s['filterInput'].apply(get_filters)
s[s['filters'] != s['filters_new']]
path = r'/Users/saman.tamkeen/Desktop/goa/session_data/'
all_files = glob.glob(path + "/*.csv")
li = []
errors = []
for i, filename in enumerate(all_files):
try:
s = pd.read_csv(filename,
delimiter='|', escapechar='\\',
parse_dates=['searchTime','detailTime','bookingTime','checkIn','checkOut'])
s['los'] = (s['checkOut'] - s['checkIn']).dt.days
s['searchDate'] = pd.to_datetime(s['searchTime'].dt.date)
s['ap'] = (s['checkIn'] - s['searchDate']).dt.days
s['pax_bucket'] = np.select([(s['pax'] == '1-1_0'),(s['pax'] == '1-2_0')], ['solo', 'couple'], default='other')
s['is_relevant'] = s['detailTime'].notna()
s = s[['searchDate','flavour','los','ap','pax_bucket','searchVhid','is_relevant']]
if s.shape[0] > 0:
li.append(s)
print(i, filename.split('/')[-1])
    except Exception:
        errors.append(filename)
session_data = pd.concat(li, axis=0, ignore_index=True)
print(errors)
print(session_data.shape)
session_data.head()
session_data.to_csv('goa/session_data.csv', index=False)
pd.read_csv('/Users/saman.tamkeen/Desktop/goa/daily_funnel_data/part-00160-tid-8719775214049067280-83283836-dd94-488e-9fb0-686ff67620e3-7769-1-c000.csv',delimiter='|')
search_data_2 = pd.concat([search_data,search_data_2], axis=0, ignore_index=True)
print(search_data_2.shape)
path = r'/Users/saman.tamkeen/Desktop/goa/searches/'
all_files = glob.glob(path + "/*.csv")
li = []
for i, filename in enumerate(all_files):
print(i, filename.split('/')[-1])
os.remove('/Users/saman.tamkeen/Desktop/nohup.out')
from itertools import zip_longest
a = [1,2,3]
b = ['a','b','c']
c = []
[(_1,_2,_3) for _1,_2,_3 in zip_longest(a,b,c)]